R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

data <- read.csv("obesity.csv")
data$BMI <- round(data$Weight / (data$Height^2), 2)
head(data,10)
##    Age Gender Height Weight       CALC FAVC FCVC NCP SCC SMOKE CH2O
## 1   21 Female   1.62   64.0         no   no    2   3  no    no    2
## 2   21 Female   1.52   56.0  Sometimes   no    3   3 yes   yes    3
## 3   23   Male   1.80   77.0 Frequently   no    2   3  no    no    2
## 4   27   Male   1.80   87.0 Frequently   no    3   3  no    no    2
## 5   22   Male   1.78   89.8  Sometimes   no    2   1  no    no    2
## 6   29   Male   1.62   53.0  Sometimes  yes    2   3  no    no    2
## 7   23 Female   1.50   55.0  Sometimes  yes    3   3  no    no    2
## 8   22   Male   1.64   53.0  Sometimes   no    2   3  no    no    2
## 9   24   Male   1.78   64.0 Frequently  yes    3   3  no    no    2
## 10  22   Male   1.72   68.0         no  yes    2   3  no    no    2
##    family_history_with_overweight FAF TUE      CAEC                MTRANS
## 1                             yes   0   1 Sometimes Public_Transportation
## 2                             yes   3   0 Sometimes Public_Transportation
## 3                             yes   2   1 Sometimes Public_Transportation
## 4                              no   2   0 Sometimes               Walking
## 5                              no   0   0 Sometimes Public_Transportation
## 6                              no   0   0 Sometimes            Automobile
## 7                             yes   1   0 Sometimes             Motorbike
## 8                              no   3   0 Sometimes Public_Transportation
## 9                             yes   1   1 Sometimes Public_Transportation
## 10                            yes   1   1 Sometimes Public_Transportation
##             NObeyesdad   BMI
## 1        Normal_Weight 24.39
## 2        Normal_Weight 24.24
## 3        Normal_Weight 23.77
## 4   Overweight_Level_I 26.85
## 5  Overweight_Level_II 28.34
## 6        Normal_Weight 20.20
## 7        Normal_Weight 24.44
## 8        Normal_Weight 19.71
## 9        Normal_Weight 20.20
## 10       Normal_Weight 22.99
str(data)
## 'data.frame':    2111 obs. of  18 variables:
##  $ Age                           : num  21 21 23 27 22 29 23 22 24 22 ...
##  $ Gender                        : chr  "Female" "Female" "Male" "Male" ...
##  $ Height                        : num  1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
##  $ Weight                        : num  64 56 77 87 89.8 53 55 53 64 68 ...
##  $ CALC                          : chr  "no" "Sometimes" "Frequently" "Frequently" ...
##  $ FAVC                          : chr  "no" "no" "no" "no" ...
##  $ FCVC                          : num  2 3 2 3 2 2 3 2 3 2 ...
##  $ NCP                           : num  3 3 3 3 1 3 3 3 3 3 ...
##  $ SCC                           : chr  "no" "yes" "no" "no" ...
##  $ SMOKE                         : chr  "no" "yes" "no" "no" ...
##  $ CH2O                          : num  2 3 2 2 2 2 2 2 2 2 ...
##  $ family_history_with_overweight: chr  "yes" "yes" "yes" "no" ...
##  $ FAF                           : num  0 3 2 2 0 0 1 3 1 1 ...
##  $ TUE                           : num  1 0 1 0 0 0 0 0 1 1 ...
##  $ CAEC                          : chr  "Sometimes" "Sometimes" "Sometimes" "Sometimes" ...
##  $ MTRANS                        : chr  "Public_Transportation" "Public_Transportation" "Public_Transportation" "Walking" ...
##  $ NObeyesdad                    : chr  "Normal_Weight" "Normal_Weight" "Normal_Weight" "Overweight_Level_I" ...
##  $ BMI                           : num  24.4 24.2 23.8 26.9 28.3 ...
unique(data$NObeyesdad)
## [1] "Normal_Weight"       "Overweight_Level_I"  "Overweight_Level_II"
## [4] "Obesity_Type_I"      "Insufficient_Weight" "Obesity_Type_II"    
## [7] "Obesity_Type_III"
overweight_data <- data[data$NObeyesdad == "Overweight_Level_I", ]

# Printing the number of rows in the filtered data
print(nrow(overweight_data))
## [1] 290
pie_colors <- c("lightblue", "lightgreen", "orange", "brown", "lavender", "lightyellow", "lightpink")
target_count <- table(data$NObeyesdad)
target_count <- as.data.frame(target_count)
colnames(target_count) <- c("NObeyesdad", "Count")


target_count$Percentage <- target_count$Count / sum(target_count$Count) * 100
# Create a pie chart with ggplot2
ggplot(target_count, aes(x = "", y = Count, fill = NObeyesdad)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y") +
  scale_fill_manual(values = pie_colors) +
  labs(title = "Distribution of Obesity Levels") +
  theme_void() +
  theme(legend.position = "right") +
  guides(fill = guide_legend(title = "Obesity Levels")) +
  geom_text(
    aes(label = sprintf("%.1f%%", Percentage)),
    position = position_stack(vjust = 0.5),    
    size = 4                                   
  )

# Create a bar plot for obesity types
g <- ggplot(data, aes(x=NObeyesdad))
g + geom_bar(aes(fill=NObeyesdad), width = 0.5) + 
  theme(axis.text.x = element_text(angle=65, vjust=0.6)) +
  labs(title="Histogram on Categorical Variable", 
       subtitle="Distribution of Different Obesity Types",
       x = "Obesity Type", y = "Count")

theme_set(theme_classic())

# Subset for specific types of obesity
obesity_type <- data[data$NObeyesdad %in% c("Obesity_Type_I", "Obesity_Type_II", "Obesity_Type_III"), ]
head(obesity_type)
##    Age Gender Height Weight       CALC FAVC FCVC NCP SCC SMOKE CH2O
## 11  26   Male   1.85    105  Sometimes  yes    3   3  no    no    3
## 14  41   Male   1.80     99 Frequently  yes    2   3  no    no    2
## 18  29 Female   1.53     78         no  yes    2   1  no    no    2
## 22  52 Female   1.69     87         no  yes    3   1  no   yes    2
## 24  22 Female   1.60     82  Sometimes  yes    1   1  no    no    2
## 68  23   Male   1.65     95 Frequently  yes    2   3  no    no    2
##    family_history_with_overweight FAF TUE       CAEC                MTRANS
## 11                            yes   2   2 Frequently Public_Transportation
## 14                             no   2   1  Sometimes            Automobile
## 18                             no   0   0  Sometimes            Automobile
## 22                            yes   0   0  Sometimes            Automobile
## 24                            yes   0   2  Sometimes Public_Transportation
## 68                            yes   0   1     Always            Automobile
##        NObeyesdad   BMI
## 11 Obesity_Type_I 30.68
## 14 Obesity_Type_I 30.56
## 18 Obesity_Type_I 33.32
## 22 Obesity_Type_I 30.46
## 24 Obesity_Type_I 32.03
## 68 Obesity_Type_I 34.89
# Subset for specific levels of overweight
overweight_level <- data[data$NObeyesdad %in% c("Overweight_Level_I", "Overweight_Level_II"), ]
head(overweight_level)
##    Age Gender Height Weight       CALC FAVC FCVC NCP SCC SMOKE CH2O
## 4   27   Male   1.80   87.0 Frequently   no    3   3  no    no    2
## 5   22   Male   1.78   89.8  Sometimes   no    2   1  no    no    2
## 12  21 Female   1.72   80.0  Sometimes  yes    2   3 yes    no    2
## 17  27   Male   1.93  102.0  Sometimes  yes    2   1  no    no    1
## 19  30 Female   1.71   82.0         no  yes    3   4  no   yes    1
## 20  23 Female   1.65   70.0  Sometimes   no    2   1  no    no    2
##    family_history_with_overweight FAF TUE       CAEC                MTRANS
## 4                              no   2   0  Sometimes               Walking
## 5                              no   0   0  Sometimes Public_Transportation
## 12                            yes   2   1 Frequently Public_Transportation
## 17                            yes   1   0  Sometimes Public_Transportation
## 19                            yes   0   0 Frequently            Automobile
## 20                            yes   0   0  Sometimes Public_Transportation
##             NObeyesdad   BMI
## 4   Overweight_Level_I 26.85
## 5  Overweight_Level_II 28.34
## 12 Overweight_Level_II 27.04
## 17 Overweight_Level_II 27.38
## 19 Overweight_Level_II 28.04
## 20  Overweight_Level_I 25.71
others <- data[!data$NObeyesdad %in% c("Obesity_Type_I", "Obesity_Type_II", "Obesity_Type_III", "Overweight_Level_I", "Overweight_Level_II"), ]
head(others)
##   Age Gender Height Weight       CALC FAVC FCVC NCP SCC SMOKE CH2O
## 1  21 Female   1.62     64         no   no    2   3  no    no    2
## 2  21 Female   1.52     56  Sometimes   no    3   3 yes   yes    3
## 3  23   Male   1.80     77 Frequently   no    2   3  no    no    2
## 6  29   Male   1.62     53  Sometimes  yes    2   3  no    no    2
## 7  23 Female   1.50     55  Sometimes  yes    3   3  no    no    2
## 8  22   Male   1.64     53  Sometimes   no    2   3  no    no    2
##   family_history_with_overweight FAF TUE      CAEC                MTRANS
## 1                            yes   0   1 Sometimes Public_Transportation
## 2                            yes   3   0 Sometimes Public_Transportation
## 3                            yes   2   1 Sometimes Public_Transportation
## 6                             no   0   0 Sometimes            Automobile
## 7                            yes   1   0 Sometimes             Motorbike
## 8                             no   3   0 Sometimes Public_Transportation
##      NObeyesdad   BMI
## 1 Normal_Weight 24.39
## 2 Normal_Weight 24.24
## 3 Normal_Weight 23.77
## 6 Normal_Weight 20.20
## 7 Normal_Weight 24.44
## 8 Normal_Weight 19.71
theme_set(theme_bw())

# plot
g <- ggplot(data = data, aes(x=Age, y= NObeyesdad,fill=NObeyesdad))
g + geom_violin() + 
  labs(title="Age vs NObeyesdad",
       caption="Source: mpg",
       x="Age",
       y="NObeyesdad")

ggplot(data, aes(x=Age, fill=NObeyesdad)) +
  geom_density(alpha=0.4)+
  xlim(14,40)+labs(title='Density Plot for Age',x='Age',y='Density')+theme_minimal()
## Warning: Removed 57 rows containing non-finite outside the scale range
## (`stat_density()`).

ggplot(others, aes(x = Age)) +
  geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
  facet_wrap(~NObeyesdad, scales = "free") +
  labs(title = "Age Distribution for Normal Weight and Insufficient Types")+
    theme(axis.text = element_text(size = 6))

ggplot(obesity_type, aes(x = Age)) +
  geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
  facet_wrap(~NObeyesdad, scales = "free") +
  labs(title = "Age Distribution for Different Obesity Types")+
    theme(axis.text = element_text(size = 6))

ggplot(overweight_level, aes(x = Age)) +
  geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
  facet_wrap(~NObeyesdad, scales = "free") +
  labs(title = "Age Distribution for Different Overweight Types")+
    theme(axis.text = element_text(size = 6))

g <- ggplot(data = data, aes(x=Weight, y= NObeyesdad,fill=NObeyesdad))
g + geom_violin() + 
  labs(title="Weight vs NObeyesdad",
       caption="Source: mpg",
       x="Weight",
       y="NObeyesdad")

ggplot(data, aes(x=Weight, fill=NObeyesdad)) +
  geom_density(alpha=0.4)+
  xlim(30,170)+labs(title='Density Plot for Weight',x='Weight',y='Density')+theme_minimal()
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_density()`).

ggplot(others, aes(x = Weight)) +
  geom_histogram(binwidth = 1, fill = "lavender", color = "black") +
  facet_wrap(~NObeyesdad, scales = "free") +
  labs(title = "Weight Distribution for Normal Weight and Insufficient Types")+
    theme(axis.text = element_text(size = 6))

ggplot(obesity_type, aes(x = Weight)) +
  geom_histogram(binwidth = 1, fill = "lavender", color = "black") +
  facet_wrap(~NObeyesdad, scales = "free") +
  labs(title = "Weight Distribution for Different Obesity Types")+
    theme(axis.text = element_text(size = 6))

ggplot(overweight_level, aes(x = Weight)) +
  geom_histogram(binwidth = 1, fill = "lavender", color = "black") +
  facet_wrap(~NObeyesdad, scales = "free") +
  labs(title = "Weight Distribution for Different Overweight Types")+
    theme(axis.text = element_text(size = 6))

g <- ggplot(data = data, aes(x=Height, y= NObeyesdad,fill=NObeyesdad))
g + geom_violin() + 
  labs(title="Height vs NObeyesdad",
       caption="Source: mpg",
       x="Height",
       y="NObeyesdad")

ggplot(data, aes(x=Height, fill=NObeyesdad)) +
  geom_density(alpha=0.4)+
  xlim(1.4,2.2)+labs(title='Density Plot for Height',x='Height',y='Density')+theme_minimal()

ggplot(others, aes(x = Height)) +
  geom_histogram(binwidth = 1, fill = "lightgray", color = "black") +
  facet_wrap(~NObeyesdad, scales = "free") +
  labs(title = "Height Distribution for Normal Weight and Insufficient Types")+
    theme(axis.text = element_text(size = 6))

ggplot(obesity_type, aes(x = Height)) +
  geom_histogram(binwidth = 1, fill = "lightgray", color = "black") +
  facet_wrap(~NObeyesdad, scales = "free") +
  labs(title = "Height Distribution for Different Obesity Types")+
    theme(axis.text = element_text(size = 6))

ggplot(overweight_level, aes(x = Height)) +
  geom_histogram(binwidth = 1, fill = "lightgray", color = "black") +
  facet_wrap(~NObeyesdad, scales = "free") +
  labs(title = "Age Distribution for Different Overweight Types")+
    theme(axis.text = element_text(size = 6))

ggplot(data, aes(x=BMI, fill=NObeyesdad)) +
  geom_density(alpha=0.4)+
  xlim(-0,60)+labs(title='Density Plot for BMI',x='BMI',y='Density')+theme_minimal()

g <- ggplot(data, aes(x = NObeyesdad))
g + geom_bar(aes(fill = Gender), width = 0.5, position = "dodge") + 
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6)) +
  labs(title = "Distribution of Different Obesity Types by Gender",
       x = "Obesity Type", y = "Count")

g <- ggplot(data, aes(x = NObeyesdad))
g + geom_bar(aes(fill = FAVC), width = 0.5, position = "dodge") + 
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6)) +
  labs(title ="Distribution of Different Obesity Types by FAVC",
       x = "Obesity Type", y = "Count")

g <- ggplot(data, aes(x = NObeyesdad))
g + geom_bar(aes(fill = CALC), width = 0.5, position = "dodge") + 
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6)) +
  labs(title = "Distribution of Different Obesity Types by CALC",
       x = "Obesity Type", y = "Count")

g <- ggplot(data, aes(x = NObeyesdad))
g + geom_bar(aes(fill = SCC), width = 0.5, position = "dodge") + 
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6)) +
  labs(title = "Distribution of Different Obesity Types by SCC",
       x = "Obesity Type", y = "Count")

g <- ggplot(data, aes(x = NObeyesdad))
g + geom_bar(aes(fill = SMOKE), width = 0.5, position = "dodge") + 
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6)) +
  labs(title = "Distribution of Different Obesity Types by SMOKE",
       x = "Obesity Type", y = "Count")

g <- ggplot(data, aes(x = NObeyesdad))
g + geom_bar(aes(fill = CAEC), width = 0.5, position = "dodge") + 
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6)) +
  labs(title ="Distribution of Different Obesity Types by CAEC",
       x = "Obesity Type", y = "Count")

g <- ggplot(data, aes(x = NObeyesdad))
g + geom_bar(aes(fill = MTRANS), width = 0.5, position = "dodge") + 
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6)) +
  labs(title = "Distribution of Different Obesity Types by MTRANS",
       x = "Obesity Type", y = "Count")

ggplot(data, aes(x = FCVC, y =NObeyesdad )) +
  geom_point() + 
  labs(title = "Scatterplot of Obesity Levels vs. FCVC",
       x = "Frequency of Vegetables Taken", 
       y = "Obesity Levels") +
  theme_minimal()

ggplot(data, aes(x = CH2O, y =NObeyesdad )) +
  geom_point() + 
  labs(title = "Scatterplot of Obesity Levels vs. FCVC",
       x = "Frequency of Water Consumed", 
       y = "Obesity Levels") +
  theme_minimal()

ggplot(data, aes(x = FAF, y =NObeyesdad )) +
  geom_point() + 
  labs(title = "Scatterplot of Obesity Levels vs. FCVC",
       x = "Frequency of Physical Activity Done", 
       y = "Obesity Levels") +
  theme_minimal()

ggplot(data, aes(x = TUE, y =NObeyesdad )) +
  geom_point() + 
  labs(title = "Scatterplot of Obesity Levels vs. FCVC",
       x = "Frequency of Usage of Technological Devices", 
       y = "Obesity Levels") +
  theme_minimal()

# Convert character variables to factors
data$Gender <- factor(data$Gender, levels = unique(data$Gender))
data$CALC <- factor(data$CALC, levels = unique(data$CALC))
data$FAVC <- factor(data$FAVC, levels = unique(data$FAVC))
data$SCC <- factor(data$SCC, levels = unique(data$SCC))
data$SMOKE <- factor(data$SMOKE, levels = unique(data$SMOKE))
data$family_history_with_overweight <- factor(data$family_history_with_overweight, levels = unique(data$family_history_with_overweight))
data$CAEC <- factor(data$CAEC, levels = unique(data$CAEC))
data$MTRANS <- factor(data$MTRANS, levels = unique(data$MTRANS))
data$NObeyesdad <- factor(data$NObeyesdad, levels = unique(data$NObeyesdad))

# Check the structure of the data frame
str(data)
## 'data.frame':    2111 obs. of  18 variables:
##  $ Age                           : num  21 21 23 27 22 29 23 22 24 22 ...
##  $ Gender                        : Factor w/ 2 levels "Female","Male": 1 1 2 2 2 2 1 2 2 2 ...
##  $ Height                        : num  1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
##  $ Weight                        : num  64 56 77 87 89.8 53 55 53 64 68 ...
##  $ CALC                          : Factor w/ 4 levels "no","Sometimes",..: 1 2 3 3 2 2 2 2 3 1 ...
##  $ FAVC                          : Factor w/ 2 levels "no","yes": 1 1 1 1 1 2 2 1 2 2 ...
##  $ FCVC                          : num  2 3 2 3 2 2 3 2 3 2 ...
##  $ NCP                           : num  3 3 3 3 1 3 3 3 3 3 ...
##  $ SCC                           : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
##  $ SMOKE                         : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
##  $ CH2O                          : num  2 3 2 2 2 2 2 2 2 2 ...
##  $ family_history_with_overweight: Factor w/ 2 levels "yes","no": 1 1 1 2 2 2 1 2 1 1 ...
##  $ FAF                           : num  0 3 2 2 0 0 1 3 1 1 ...
##  $ TUE                           : num  1 0 1 0 0 0 0 0 1 1 ...
##  $ CAEC                          : Factor w/ 4 levels "Sometimes","Frequently",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ MTRANS                        : Factor w/ 5 levels "Public_Transportation",..: 1 1 1 2 1 3 4 1 1 1 ...
##  $ NObeyesdad                    : Factor w/ 7 levels "Normal_Weight",..: 1 1 1 2 3 1 1 1 1 1 ...
##  $ BMI                           : num  24.4 24.2 23.8 26.9 28.3 ...
numeric_data <- data[sapply(data, is.numeric)]

correlation_matrix <- cor(numeric_data, use = "pairwise.complete.obs")  # Handles missing data

# Print the correlation matrix
print(correlation_matrix)
##                Age      Height      Weight        FCVC         NCP        CH2O
## Age     1.00000000 -0.02595813  0.20256010  0.01629089 -0.04394373 -0.04530386
## Height -0.02595813  1.00000000  0.46313612 -0.03812106  0.24367173  0.21337592
## Weight  0.20256010  0.46313612  1.00000000  0.21612471  0.10746899  0.20057539
## FCVC    0.01629089 -0.03812106  0.21612471  1.00000000  0.04221630  0.06846147
## NCP    -0.04394373  0.24367173  0.10746899  0.04221630  1.00000000  0.05708800
## CH2O   -0.04530386  0.21337592  0.20057539  0.06846147  0.05708800  1.00000000
## FAF    -0.14493833  0.29470900 -0.05143627  0.01993940  0.12950431  0.16723649
## TUE    -0.29693059  0.05191167 -0.07156136 -0.10113485  0.03632557  0.01196534
## BMI     0.24414923  0.13178799  0.93480707  0.26364870  0.03997038  0.14418710
##                FAF         TUE         BMI
## Age    -0.14493833 -0.29693059  0.24414923
## Height  0.29470900  0.05191167  0.13178799
## Weight -0.05143627 -0.07156136  0.93480707
## FCVC    0.01993940 -0.10113485  0.26364870
## NCP     0.12950431  0.03632557  0.03997038
## CH2O    0.16723649  0.01196534  0.14418710
## FAF     1.00000000  0.05856207 -0.17752763
## TUE     0.05856207  1.00000000 -0.09970563
## BMI    -0.17752763 -0.09970563  1.00000000
# Melt the correlation matrix to make it suitable for ggplot2
melted_corr <- melt(correlation_matrix)

# Create a heatmap of the correlation matrix
ggplot(melted_corr, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white") +  # Adds white borders to separate the tiles
  geom_text(aes(label = sprintf("%.2f", value)), size = 3, color = "black") +  # Adds correlation coefficients as labels
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0, 
                       limits = c(-1, 1), space = "Lab", 
                       name = "Correlation") +  # Sets the color gradient
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
    axis.text.y = element_text(angle = 0, vjust = 0.5),
    axis.title = element_blank(),  # Removes axis titles
    legend.position = "bottom"
  ) +
  labs(fill = 'Correlation Coefficient')  # Legend label

Including Plots

You can also embed plots, for example:

library(e1071)
library(gmodels)
fwd<-regsubsets(NObeyesdad~.,data=data,nvmax=16, method='forward')
summary(fwd)
## Subset selection object
## Call: regsubsets.formula(NObeyesdad ~ ., data = data, nvmax = 16, method = "forward")
## 24 Variables  (and intercept)
##                                  Forced in Forced out
## Age                                  FALSE      FALSE
## GenderMale                           FALSE      FALSE
## Height                               FALSE      FALSE
## Weight                               FALSE      FALSE
## CALCSometimes                        FALSE      FALSE
## CALCFrequently                       FALSE      FALSE
## CALCAlways                           FALSE      FALSE
## FAVCyes                              FALSE      FALSE
## FCVC                                 FALSE      FALSE
## NCP                                  FALSE      FALSE
## SCCyes                               FALSE      FALSE
## SMOKEyes                             FALSE      FALSE
## CH2O                                 FALSE      FALSE
## family_history_with_overweightno     FALSE      FALSE
## FAF                                  FALSE      FALSE
## TUE                                  FALSE      FALSE
## CAECFrequently                       FALSE      FALSE
## CAECAlways                           FALSE      FALSE
## CAECno                               FALSE      FALSE
## MTRANSWalking                        FALSE      FALSE
## MTRANSAutomobile                     FALSE      FALSE
## MTRANSMotorbike                      FALSE      FALSE
## MTRANSBike                           FALSE      FALSE
## BMI                                  FALSE      FALSE
## 1 subsets of each size up to 16
## Selection Algorithm: forward
##           Age GenderMale Height Weight CALCSometimes CALCFrequently CALCAlways
## 1  ( 1 )  " " " "        " "    " "    " "           " "            " "       
## 2  ( 1 )  " " " "        " "    " "    " "           " "            " "       
## 3  ( 1 )  " " " "        " "    " "    " "           " "            " "       
## 4  ( 1 )  " " " "        " "    " "    " "           " "            " "       
## 5  ( 1 )  " " " "        " "    " "    " "           " "            " "       
## 6  ( 1 )  " " " "        " "    " "    " "           " "            " "       
## 7  ( 1 )  " " " "        " "    " "    " "           "*"            " "       
## 8  ( 1 )  " " " "        " "    " "    " "           "*"            " "       
## 9  ( 1 )  " " " "        " "    " "    " "           "*"            " "       
## 10  ( 1 ) "*" " "        " "    " "    " "           "*"            " "       
## 11  ( 1 ) "*" " "        " "    " "    " "           "*"            " "       
## 12  ( 1 ) "*" " "        " "    " "    " "           "*"            " "       
## 13  ( 1 ) "*" " "        " "    " "    " "           "*"            " "       
## 14  ( 1 ) "*" " "        "*"    " "    " "           "*"            " "       
## 15  ( 1 ) "*" "*"        "*"    " "    " "           "*"            " "       
## 16  ( 1 ) "*" "*"        "*"    " "    " "           "*"            " "       
##           FAVCyes FCVC NCP SCCyes SMOKEyes CH2O
## 1  ( 1 )  " "     " "  " " " "    " "      " " 
## 2  ( 1 )  " "     "*"  " " " "    " "      " " 
## 3  ( 1 )  " "     "*"  "*" " "    " "      " " 
## 4  ( 1 )  " "     "*"  "*" " "    " "      " " 
## 5  ( 1 )  " "     "*"  "*" " "    " "      " " 
## 6  ( 1 )  " "     "*"  "*" " "    " "      " " 
## 7  ( 1 )  " "     "*"  "*" " "    " "      " " 
## 8  ( 1 )  "*"     "*"  "*" " "    " "      " " 
## 9  ( 1 )  "*"     "*"  "*" " "    " "      " " 
## 10  ( 1 ) "*"     "*"  "*" " "    " "      " " 
## 11  ( 1 ) "*"     "*"  "*" " "    " "      " " 
## 12  ( 1 ) "*"     "*"  "*" "*"    " "      " " 
## 13  ( 1 ) "*"     "*"  "*" "*"    " "      " " 
## 14  ( 1 ) "*"     "*"  "*" "*"    " "      " " 
## 15  ( 1 ) "*"     "*"  "*" "*"    " "      " " 
## 16  ( 1 ) "*"     "*"  "*" "*"    " "      " " 
##           family_history_with_overweightno FAF TUE CAECFrequently CAECAlways
## 1  ( 1 )  " "                              " " " " " "            " "       
## 2  ( 1 )  " "                              " " " " " "            " "       
## 3  ( 1 )  " "                              " " " " " "            " "       
## 4  ( 1 )  " "                              " " " " " "            "*"       
## 5  ( 1 )  " "                              " " " " " "            "*"       
## 6  ( 1 )  " "                              " " " " "*"            "*"       
## 7  ( 1 )  " "                              " " " " "*"            "*"       
## 8  ( 1 )  " "                              " " " " "*"            "*"       
## 9  ( 1 )  " "                              " " " " "*"            "*"       
## 10  ( 1 ) " "                              " " " " "*"            "*"       
## 11  ( 1 ) " "                              " " " " "*"            "*"       
## 12  ( 1 ) " "                              " " " " "*"            "*"       
## 13  ( 1 ) "*"                              " " " " "*"            "*"       
## 14  ( 1 ) "*"                              " " " " "*"            "*"       
## 15  ( 1 ) "*"                              " " " " "*"            "*"       
## 16  ( 1 ) "*"                              "*" " " "*"            "*"       
##           CAECno MTRANSWalking MTRANSAutomobile MTRANSMotorbike MTRANSBike BMI
## 1  ( 1 )  " "    " "           " "              " "             " "        "*"
## 2  ( 1 )  " "    " "           " "              " "             " "        "*"
## 3  ( 1 )  " "    " "           " "              " "             " "        "*"
## 4  ( 1 )  " "    " "           " "              " "             " "        "*"
## 5  ( 1 )  " "    "*"           " "              " "             " "        "*"
## 6  ( 1 )  " "    "*"           " "              " "             " "        "*"
## 7  ( 1 )  " "    "*"           " "              " "             " "        "*"
## 8  ( 1 )  " "    "*"           " "              " "             " "        "*"
## 9  ( 1 )  "*"    "*"           " "              " "             " "        "*"
## 10  ( 1 ) "*"    "*"           " "              " "             " "        "*"
## 11  ( 1 ) "*"    "*"           " "              "*"             " "        "*"
## 12  ( 1 ) "*"    "*"           " "              "*"             " "        "*"
## 13  ( 1 ) "*"    "*"           " "              "*"             " "        "*"
## 14  ( 1 ) "*"    "*"           " "              "*"             " "        "*"
## 15  ( 1 ) "*"    "*"           " "              "*"             " "        "*"
## 16  ( 1 ) "*"    "*"           " "              "*"             " "        "*"
bwd<-regsubsets(NObeyesdad~.,data=data,nvmax=16, method='backward')
summary(bwd)
## Subset selection object
## Call: regsubsets.formula(NObeyesdad ~ ., data = data, nvmax = 16, method = "backward")
## 24 Variables  (and intercept)
##                                  Forced in Forced out
## Age                                  FALSE      FALSE
## GenderMale                           FALSE      FALSE
## Height                               FALSE      FALSE
## Weight                               FALSE      FALSE
## CALCSometimes                        FALSE      FALSE
## CALCFrequently                       FALSE      FALSE
## CALCAlways                           FALSE      FALSE
## FAVCyes                              FALSE      FALSE
## FCVC                                 FALSE      FALSE
## NCP                                  FALSE      FALSE
## SCCyes                               FALSE      FALSE
## SMOKEyes                             FALSE      FALSE
## CH2O                                 FALSE      FALSE
## family_history_with_overweightno     FALSE      FALSE
## FAF                                  FALSE      FALSE
## TUE                                  FALSE      FALSE
## CAECFrequently                       FALSE      FALSE
## CAECAlways                           FALSE      FALSE
## CAECno                               FALSE      FALSE
## MTRANSWalking                        FALSE      FALSE
## MTRANSAutomobile                     FALSE      FALSE
## MTRANSMotorbike                      FALSE      FALSE
## MTRANSBike                           FALSE      FALSE
## BMI                                  FALSE      FALSE
## 1 subsets of each size up to 16
## Selection Algorithm: backward
##           Age GenderMale Height Weight CALCSometimes CALCFrequently CALCAlways
## 1  ( 1 )  " " " "        " "    " "    " "           " "            " "       
## 2  ( 1 )  " " " "        " "    " "    " "           " "            " "       
## 3  ( 1 )  " " " "        "*"    " "    " "           " "            " "       
## 4  ( 1 )  " " "*"        "*"    " "    " "           " "            " "       
## 5  ( 1 )  " " "*"        "*"    " "    " "           " "            " "       
## 6  ( 1 )  " " "*"        "*"    " "    " "           "*"            " "       
## 7  ( 1 )  " " "*"        "*"    " "    " "           "*"            " "       
## 8  ( 1 )  " " "*"        "*"    " "    " "           "*"            " "       
## 9  ( 1 )  " " "*"        "*"    " "    " "           "*"            " "       
## 10  ( 1 ) " " "*"        "*"    " "    " "           "*"            " "       
## 11  ( 1 ) " " "*"        "*"    " "    " "           "*"            " "       
## 12  ( 1 ) " " "*"        "*"    " "    " "           "*"            " "       
## 13  ( 1 ) " " "*"        "*"    " "    " "           "*"            " "       
## 14  ( 1 ) " " "*"        "*"    " "    " "           "*"            " "       
## 15  ( 1 ) " " "*"        "*"    " "    " "           "*"            " "       
## 16  ( 1 ) " " "*"        "*"    " "    " "           "*"            " "       
##           FAVCyes FCVC NCP SCCyes SMOKEyes CH2O
## 1  ( 1 )  " "     " "  " " " "    " "      " " 
## 2  ( 1 )  " "     "*"  " " " "    " "      " " 
## 3  ( 1 )  " "     "*"  " " " "    " "      " " 
## 4  ( 1 )  " "     "*"  " " " "    " "      " " 
## 5  ( 1 )  " "     "*"  " " " "    " "      " " 
## 6  ( 1 )  " "     "*"  " " " "    " "      " " 
## 7  ( 1 )  " "     "*"  " " " "    " "      " " 
## 8  ( 1 )  " "     "*"  "*" " "    " "      " " 
## 9  ( 1 )  " "     "*"  "*" " "    " "      " " 
## 10  ( 1 ) " "     "*"  "*" "*"    " "      " " 
## 11  ( 1 ) " "     "*"  "*" "*"    " "      " " 
## 12  ( 1 ) " "     "*"  "*" "*"    " "      " " 
## 13  ( 1 ) " "     "*"  "*" "*"    " "      " " 
## 14  ( 1 ) " "     "*"  "*" "*"    " "      " " 
## 15  ( 1 ) "*"     "*"  "*" "*"    " "      " " 
## 16  ( 1 ) "*"     "*"  "*" "*"    " "      " " 
##           family_history_with_overweightno FAF TUE CAECFrequently CAECAlways
## 1  ( 1 )  " "                              " " " " " "            " "       
## 2  ( 1 )  " "                              " " " " " "            " "       
## 3  ( 1 )  " "                              " " " " " "            " "       
## 4  ( 1 )  " "                              " " " " " "            " "       
## 5  ( 1 )  " "                              " " " " "*"            " "       
## 6  ( 1 )  " "                              " " " " "*"            " "       
## 7  ( 1 )  " "                              " " " " "*"            " "       
## 8  ( 1 )  " "                              " " " " "*"            " "       
## 9  ( 1 )  " "                              " " " " "*"            "*"       
## 10  ( 1 ) " "                              " " " " "*"            "*"       
## 11  ( 1 ) " "                              "*" " " "*"            "*"       
## 12  ( 1 ) "*"                              "*" " " "*"            "*"       
## 13  ( 1 ) "*"                              "*" " " "*"            "*"       
## 14  ( 1 ) "*"                              "*" " " "*"            "*"       
## 15  ( 1 ) "*"                              "*" " " "*"            "*"       
## 16  ( 1 ) "*"                              "*" " " "*"            "*"       
##           CAECno MTRANSWalking MTRANSAutomobile MTRANSMotorbike MTRANSBike BMI
## 1  ( 1 )  " "    " "           " "              " "             " "        "*"
## 2  ( 1 )  " "    " "           " "              " "             " "        "*"
## 3  ( 1 )  " "    " "           " "              " "             " "        "*"
## 4  ( 1 )  " "    " "           " "              " "             " "        "*"
## 5  ( 1 )  " "    " "           " "              " "             " "        "*"
## 6  ( 1 )  " "    " "           " "              " "             " "        "*"
## 7  ( 1 )  " "    "*"           " "              " "             " "        "*"
## 8  ( 1 )  " "    "*"           " "              " "             " "        "*"
## 9  ( 1 )  " "    "*"           " "              " "             " "        "*"
## 10  ( 1 ) " "    "*"           " "              " "             " "        "*"
## 11  ( 1 ) " "    "*"           " "              " "             " "        "*"
## 12  ( 1 ) " "    "*"           " "              " "             " "        "*"
## 13  ( 1 ) "*"    "*"           " "              " "             " "        "*"
## 14  ( 1 ) "*"    "*"           " "              "*"             " "        "*"
## 15  ( 1 ) "*"    "*"           " "              "*"             " "        "*"
## 16  ( 1 ) "*"    "*"           "*"              "*"             " "        "*"
set.seed(100515)
index<- sample(2,nrow(data),prob = c(0.8,0.2),replace=TRUE)
train_data=data[index==1,]
test_data<-data[index==2,]
print(dim(train_data))
## [1] 1688   18
print(dim(test_data)) 
## [1] 423  18
train_data <- na.omit(train_data)  
test_data <- na.omit(test_data) 
Model1 <- multinom(NObeyesdad ~., data = train_data)
## # weights:  182 (150 variable)
## initial  value 3284.696332 
## iter  10 value 2606.909180
## iter  20 value 1892.274122
## iter  30 value 1533.229167
## iter  40 value 1107.786517
## iter  50 value 871.147048
## iter  60 value 411.798632
## iter  70 value 176.335674
## iter  80 value 118.956333
## iter  90 value 81.502219
## iter 100 value 59.470026
## final  value 59.470026 
## stopped after 100 iterations
summary(Model1)
## Warning in sqrt(diag(vc)): NaNs produced
## Call:
## multinom(formula = NObeyesdad ~ ., data = train_data)
## 
## Coefficients:
##                     (Intercept)        Age  GenderMale     Height     Weight
## Overweight_Level_I    -182.3786  0.2529941  -6.4785744  -56.18251  0.9102755
## Overweight_Level_II   -211.3650  0.6120244  -7.2895805 -157.95321  2.4190094
## Obesity_Type_I        -232.2732  0.5558998  -9.7320449 -372.68321  4.3410508
## Insufficient_Weight    154.8807  0.3325656   0.7217008   16.16593 -0.2509415
## Obesity_Type_II       -315.5912  4.6296748  52.2773475 -708.72300  9.7179727
## Obesity_Type_III      -418.1942 -0.2880451 -69.8820954 -658.32970  9.4335899
##                     CALCSometimes CALCFrequently CALCAlways    FAVCyes
## Overweight_Level_I       1.132252      -1.083029 -22.836959   2.421359
## Overweight_Level_II     -2.741616       1.365960 -54.422484  -1.063541
## Obesity_Type_I          -5.087807      14.189148 -30.260675  12.922710
## Insufficient_Weight      2.280609       6.978922 -30.082807   3.237624
## Obesity_Type_II        -12.841075     -32.680757   6.395896 -34.555268
## Obesity_Type_III        66.152052      45.112703  14.785060  75.860435
##                           FCVC         NCP     SCCyes  SMOKEyes        CH2O
## Overweight_Level_I   -3.939712  -1.1948378  10.494900 -7.517610   0.5330821
## Overweight_Level_II  -5.015420  -2.1645005   8.219942 -2.051327   0.4835274
## Obesity_Type_I       -3.907522  -2.5717224  21.246439 13.459818   4.6597414
## Insufficient_Weight   1.108195   0.8966696  -1.712377  2.602751   1.0004423
## Obesity_Type_II     -16.681017 -10.9660861 -57.148410 28.789593 -16.4304130
## Obesity_Type_III     63.173260   2.3410196  77.497622 24.089598  -2.4320917
##                     family_history_with_overweightno        FAF          TUE
## Overweight_Level_I                        0.04288806  -2.808472   0.46743086
## Overweight_Level_II                      -2.89618701  -3.205545   2.48445960
## Obesity_Type_I                          -12.28770953  -1.874315   5.24067206
## Insufficient_Weight                      -0.11420639   1.962234   0.05586729
## Obesity_Type_II                          31.45269010 -13.305927   8.59870415
## Obesity_Type_III                         28.48463822 -23.632731 -18.53965520
##                     CAECFrequently CAECAlways     CAECno MTRANSWalking
## Overweight_Level_I       -4.732421  -9.646568  -6.396555   -0.04458058
## Overweight_Level_II      -4.286391 -18.199771  -1.313567  -12.01643706
## Obesity_Type_I          -15.312871 -20.839442 -56.786779  -15.28074955
## Insufficient_Weight       5.615335  -8.622133   9.596129    5.80202811
## Obesity_Type_II         -17.182757  -7.114496  17.118648   -6.58940771
## Obesity_Type_III        104.944912 -82.673399   4.821168   -8.71275412
##                     MTRANSAutomobile MTRANSMotorbike MTRANSBike       BMI
## Overweight_Level_I        -0.8613348       -5.084285    5.43197   8.95369
## Overweight_Level_II       -6.4947246     -198.931318 -243.61929  12.16491
## Obesity_Type_I            -5.5876684      -34.992783 -107.16045  18.66597
## Insufficient_Weight        0.9031987     -120.331380 -192.83667 -10.24804
## Obesity_Type_II          -39.3816337     -156.812556  -13.07749  21.23245
## Obesity_Type_III          19.4732124     -106.107607   82.04172  15.21788
## 
## Std. Errors:
##                     (Intercept)       Age GenderMale   Height    Weight
## Overweight_Level_I    5.0618044 0.1844556   2.850205 8.794632 0.1777486
## Overweight_Level_II   4.6924854 0.2164840   3.065959 8.099412 0.2312679
## Obesity_Type_I        3.0550627 0.4000755   6.508188 5.321353 0.3130731
## Insufficient_Weight   0.9235888 0.2818630   4.915184 1.607537 0.2961229
## Obesity_Type_II       2.2837843 0.7981901   8.736016 4.126346 0.4653116
## Obesity_Type_III      1.6462039 1.7703113   3.118178 2.989828 0.5904833
##                     CALCSometimes CALCFrequently   CALCAlways  FAVCyes     FCVC
## Overweight_Level_I       2.131739       3.473127          NaN 1.868712 1.415400
## Overweight_Level_II      2.447281       3.740452 8.427990e-12 2.183088 1.713347
## Obesity_Type_I           3.464669       5.527589          NaN 5.644219 3.776649
## Insufficient_Weight      4.767213       3.779049 3.694472e-15 4.132707 2.904407
## Obesity_Type_II          6.726885       7.887031          NaN 7.176887 5.206766
## Obesity_Type_III         3.104725       5.088797 0.000000e+00 1.646204 1.395979
##                           NCP       SCCyes SMOKEyes     CH2O
## Overweight_Level_I  0.7936564 2.273622e+00 4.881380 1.383474
## Overweight_Level_II 0.9619816 3.587324e+00 5.707932 1.596154
## Obesity_Type_I      2.5792073 5.887990e+00 5.761969 3.323166
## Insufficient_Weight 1.7777368 5.117697e+00 5.849366 2.868054
## Obesity_Type_II     3.9290328 5.887994e+00 5.885718 4.811075
## Obesity_Type_III    4.5223343 5.083847e-05 5.088770 7.489395
##                     family_history_with_overweightno       FAF      TUE
## Overweight_Level_I                          1.550134 0.7412366 1.290735
## Overweight_Level_II                         2.462920 0.9002774 1.549537
## Obesity_Type_I                              7.866363 3.1133119 3.351732
## Insufficient_Weight                         2.710056 1.2431082 1.848191
## Obesity_Type_II                             9.994182 4.3431789 4.110801
## Obesity_Type_III                            3.182921 2.9775669 7.449237
##                     CAECFrequently   CAECAlways       CAECno MTRANSWalking
## Overweight_Level_I        2.029075 2.295808e+00 5.275273e+00  2.085344e+00
## Overweight_Level_II       2.870639 5.670235e+00 4.575031e+00  4.108965e+00
## Obesity_Type_I            7.427007 3.376891e+00 7.517252e+00  7.474706e+00
## Insufficient_Weight       3.034330 4.189380e+00 6.409360e-05  1.620153e+01
## Obesity_Type_II           3.469713 4.385610e+00 5.885693e+00  9.735060e-24
## Obesity_Type_III          5.150046 1.495534e-44 6.216827e-06  4.178281e-08
##                     MTRANSAutomobile MTRANSMotorbike   MTRANSBike       BMI
## Overweight_Level_I          2.644876    3.753680e+00 4.525076e+00 0.6620998
## Overweight_Level_II         3.375591    8.748933e-14          NaN 0.5434728
## Obesity_Type_I              7.977993    3.730855e+00 1.517699e-14 1.0203203
## Insufficient_Weight         4.572389    6.585211e-16 4.856742e-16 1.1946205
## Obesity_Type_II             7.929581    4.047698e-29 1.080400e-30 1.2326828
## Obesity_Type_III            5.762076    4.380917e-20 2.067739e-36 1.7119664
## 
## Residual Deviance: 118.9401 
## AIC: 418.9401
test_predictions <- predict(Model1, newdata = test_data)

test_actual_classes <- test_data$NObeyesdad
multinom_accuracy <- mean(test_predictions == test_actual_classes)
print(paste("Testing Accuracy of the multinomial logistic regression model:", multinom_accuracy))
## [1] "Testing Accuracy of the multinomial logistic regression model: 0.945626477541371"
naive_model <- naiveBayes(NObeyesdad ~., data = train_data)
summary(naive_model)
##           Length Class  Mode     
## apriori    7     table  numeric  
## tables    17     -none- list     
## levels     7     -none- character
## isnumeric 17     -none- logical  
## call       4     -none- call
predictions <- predict(naive_model, test_data)

# Calculate accuracy
actual <- test_data$NObeyesdad  # The actual labels
correct <- sum(predictions == actual)  # Count correct predictions
nb_accuracy <- correct / length(actual)  # Calculate accuracy

# Print the accuracy
print(paste("Accuracy of the Naive Bayes model:", nb_accuracy))
## [1] "Accuracy of the Naive Bayes model: 0.936170212765957"
lda_model <- lda(NObeyesdad~., train_data)
summary(lda_model)
##         Length Class  Mode     
## prior     7    -none- numeric  
## counts    7    -none- numeric  
## means   168    -none- numeric  
## scaling 144    -none- numeric  
## lev       7    -none- character
## svd       6    -none- numeric  
## N         1    -none- numeric  
## call      3    -none- call     
## terms     3    terms  call     
## xlevels   8    -none- list
predictions <- predict(lda_model, newdata = test_data)
predicted_labels <- predictions$class
actual_labels <- test_data$NObeyesdad

lda_accuracy <- sum(predicted_labels == actual_labels) / length(actual_labels)

# Print the accuracy
print(paste("Accuracy of the LDA model:", lda_accuracy))
## [1] "Accuracy of the LDA model: 0.898345153664303"
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
actual <- test_data$NObeyesdad  # The actual labels
correct <- sum(predictions == actual)  # Count correct predictions
accuracy <- correct / length(actual)  # Calculate accuracy


# Impute missing values with mean
 


# Fit Random Forest model on imputed data
rf_model_imputed <- randomForest(NObeyesdad ~ ., data = train_data)

# Predict using Random Forest model on test data
rf_pred_imputed <- predict(rf_model_imputed, newdata = test_data)

# Evaluate Random Forest model on imputed data
rf_accuracy <- mean(rf_pred_imputed == test_data$NObeyesdad)
print(paste("Random Forest Accuracy (with imputed data):", rf_accuracy))
## [1] "Random Forest Accuracy (with imputed data): 0.99290780141844"
svm_model <- svm(NObeyesdad ~ ., data = train_data)

svm_pred <- predict(svm_model, newdata = test_data)

svm_accuracy <- mean(svm_pred == test_data$NObeyesdad)
print(paste("SVM Accuracy:", svm_accuracy))
## [1] "SVM Accuracy: 0.933806146572104"
library(rpart)
library(rpart.plot)
library(nnet)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
## 
##     melanoma
library(class)

# Fit the decision tree model using rpart
tree_model <- rpart(NObeyesdad ~ ., data = train_data, method = "class")

# Plot the decision tree
rpart.plot(tree_model, type = 4, extra = 101, fallen.leaves = TRUE)
## Warning: All boxes will be white (the box.palette argument will be ignored) because
## the number of classes in the response 7 is greater than length(box.palette) 6.
## To silence this warning use box.palette=0 or trace=-1.
tree_predictions <- predict(tree_model, newdata = test_data, type = "class")
tree_actual_classes <- test_data$NObeyesdad

# Calculate accuracy: proportion of correct predictions
tree_accuracy <- mean(tree_predictions == tree_actual_classes)
# Print the testing accuracy
print(paste("Testing Accuracy of the decision tree model:", tree_accuracy))
## [1] "Testing Accuracy of the decision tree model: 0.969267139479905"
rpart.plot(tree_model, type = 4, extra = 101, fallen.leaves = TRUE)
## Warning: All boxes will be white (the box.palette argument will be ignored) because
## the number of classes in the response 7 is greater than length(box.palette) 6.
## To silence this warning use box.palette=0 or trace=-1.

# Normalize the numeric features
preproc <- preProcess(train_data[, sapply(train_data, is.numeric)], method = c("center", "scale"))
train_data_norm <- predict(preproc, train_data)
test_data_norm <- predict(preproc, test_data)

nn_model <- nnet(NObeyesdad ~ ., data = train_data_norm, size = 3, maxit = 200)
## # weights:  103
## initial  value 3650.939100 
## iter  10 value 1411.263845
## iter  20 value 1073.807853
## iter  30 value 875.898472
## iter  40 value 669.258917
## iter  50 value 533.208453
## iter  60 value 446.811551
## iter  70 value 385.815436
## iter  80 value 362.376371
## iter  90 value 348.339412
## iter 100 value 299.648481
## iter 110 value 195.205075
## iter 120 value 149.561059
## iter 130 value 124.924237
## iter 140 value 115.659332
## iter 150 value 113.261008
## iter 160 value 108.820121
## iter 170 value 106.471205
## iter 180 value 104.421707
## iter 190 value 102.559832
## iter 200 value 101.989389
## final  value 101.989389 
## stopped after 200 iterations
# Predict using the neural network model on test data
nn_predictions <- predict(nn_model, newdata = test_data_norm, type = "class")

nn_predictions <- factor(nn_predictions, levels = levels(test_data$NObeyesdad))

# Calculate accuracy
nn_accuracy <- mean(nn_predictions == test_data$NObeyesdad)
print(paste("Neural Network Accuracy:", nn_accuracy))
## [1] "Neural Network Accuracy: 0.955082742316785"
# Calculate confusion matrix
conf_matrix <- confusionMatrix(nn_predictions, test_data$NObeyesdad)

# Calculate sensitivity and specificity
nn_sensitivity <- conf_matrix$byClass["Sensitivity"]
nn_specificity <- conf_matrix$byClass["Specificity"]

print(paste("Neural Network Sensitivity:", nn_sensitivity))
## [1] "Neural Network Sensitivity: NA"
print(paste("Neural Network Specificity:", nn_specificity))
## [1] "Neural Network Specificity: NA"
train_data_normalized <- as.data.frame(scale(train_data[ , sapply(train_data, is.numeric)]))
test_data_normalized <- as.data.frame(scale(test_data[ , sapply(test_data, is.numeric)]))

# Train a kNN model
knn_model <- knn(train = train_data_normalized, test = test_data_normalized, cl = train_data$NObeyesdad, k = 5)

# Convert predictions and actual values to factors with the same levels
knn_model <- factor(knn_model, levels = levels(test_data$NObeyesdad))

# Calculate accuracy for kNN model
knn_accuracy <- mean(knn_model == test_data$NObeyesdad)
print(paste("kNN Accuracy:", knn_accuracy))
## [1] "kNN Accuracy: 0.855791962174941"
# Calculate confusion matrix for kNN model
conf_matrix_knn <- confusionMatrix(knn_model, test_data$NObeyesdad)

# Calculate sensitivity and specificity for kNN model
knn_sensitivity <- conf_matrix_knn$byClass["Sensitivity"]
knn_specificity <- conf_matrix_knn$byClass["Specificity"]

print(paste("kNN Sensitivity:", knn_sensitivity))
## [1] "kNN Sensitivity: NA"
print(paste("kNN Specificity:", knn_specificity))
## [1] "kNN Specificity: NA"
# Combine all model accuracies for comparison
models <- c("Multinomial Logistic Regression", "Naive Bayes", "LDA", "Random Forest", "SVM", "Decision Tree", "Neural Network", "kNN")
test_accuracies <- c(multinom_accuracy, nb_accuracy, lda_accuracy, rf_accuracy, svm_accuracy, tree_accuracy, nn_accuracy, knn_accuracy)


# Create a data frame
accuracy_data <- data.frame(Model = models, Accuracy = test_accuracies)
library(RColorBrewer)

# Define a good color palette
palette_colors <- brewer.pal(n = length(unique(accuracy_data$Model)), name = "Set3")


# Sort the accuracy dataframe by Accuracy in descending order
sorted_accuracy_data <- accuracy_data[order(-accuracy_data$Accuracy), ]

# Print the sorted accuracy dataframe
print(sorted_accuracy_data)
##                             Model  Accuracy
## 4                   Random Forest 0.9929078
## 6                   Decision Tree 0.9692671
## 7                  Neural Network 0.9550827
## 1 Multinomial Logistic Regression 0.9456265
## 2                     Naive Bayes 0.9361702
## 5                             SVM 0.9338061
## 3                             LDA 0.8983452
## 8                             kNN 0.8557920
# Initialize vectors to store TP, TN, FP, FN
TP <- numeric(length(models))
TN <- numeric(length(models))
FP <- numeric(length(models))
FN <- numeric(length(models))

# Function to calculate TP, TN, FP, FN
calculate_metrics <- function(predictions, actual) {
  TP <- sum(predictions == actual & predictions == "Obesity_Type_I")
  TN <- sum(predictions != actual & predictions != "Obesity_Type_I")
  FP <- sum(predictions == "Obesity_Type_I" & actual != "Obesity_Type_I")
  FN <- sum(predictions != "Obesity_Type_I" & actual == "Obesity_Type_I")
  return(c(TP = TP, TN = TN, FP = FP, FN = FN))
}

# Calculate TP, TN, FP, FN for each model
for (i in 1:length(models)) {
  if (models[i] == "Multinomial Logistic Regression") {
    metrics <- calculate_metrics(test_predictions, test_actual_classes)
  } else if (models[i] == "Naive Bayes") {
    metrics <- calculate_metrics(predictions, actual)
  } else if (models[i] == "LDA") {
    metrics <- calculate_metrics(predicted_labels, actual_labels)
  } else if (models[i] == "Random Forest") {
    metrics <- calculate_metrics(rf_pred_imputed, test_data$NObeyesdad)
  } else if (models[i] == "SVM") {
    metrics <- calculate_metrics(svm_pred, test_data$NObeyesdad)
  } else if (models[i] == "Decision Tree") {
    metrics <- calculate_metrics(tree_predictions, tree_actual_classes)
  }
  
  TP[i] <- metrics["TP"]
  TN[i] <- metrics["TN"]
  FP[i] <- metrics["FP"]
  FN[i] <- metrics["FN"]
}

# Calculate sensitivity and specificity
sensitivity <- TP / (TP + FN)
specificity <- TN / (TN + FP)

# Create a dataframe to store results
metrics_data <- data.frame(Model = models, Accuracy=test_accuracies,Sensitivity = sensitivity, Specificity = specificity)

sorted_metrics_data <- metrics_data %>% arrange(desc(Accuracy))

# Print the sorted metrics data frame
print(sorted_metrics_data)
##                             Model  Accuracy Sensitivity Specificity
## 1                   Random Forest 0.9929078    1.000000   1.0000000
## 2                   Decision Tree 0.9692671    0.937500   0.9230769
## 3                  Neural Network 0.9550827    0.937500   0.9230769
## 4 Multinomial Logistic Regression 0.9456265    0.921875   0.9565217
## 5                     Naive Bayes 0.9361702    0.000000   1.0000000
## 6                             SVM 0.9338061    0.937500   0.8928571
## 7                             LDA 0.8983452    0.906250   0.9534884
## 8                             kNN 0.8557920    0.937500   0.9230769
# Print the dataframe
print(metrics_data)
##                             Model  Accuracy Sensitivity Specificity
## 1 Multinomial Logistic Regression 0.9456265    0.921875   0.9565217
## 2                     Naive Bayes 0.9361702    0.000000   1.0000000
## 3                             LDA 0.8983452    0.906250   0.9534884
## 4                   Random Forest 0.9929078    1.000000   1.0000000
## 5                             SVM 0.9338061    0.937500   0.8928571
## 6                   Decision Tree 0.9692671    0.937500   0.9230769
## 7                  Neural Network 0.9550827    0.937500   0.9230769
## 8                             kNN 0.8557920    0.937500   0.9230769
ggplot(metrics_data, aes(x = Model, y = Sensitivity, fill = Model)) +
  geom_bar(stat = "identity") +
  labs(title = "Sensitivity of Different Models",
       x = "Model", y = "Sensitivity") +
  scale_fill_manual(values = palette_colors) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Plot Specificity
ggplot(metrics_data, aes(x = Model, y = Specificity, fill = Model)) +
  geom_bar(stat = "identity") +
  labs(title = "Specificity of Different Models",
       x = "Model", y = "Specificity") +
  scale_fill_manual(values = palette_colors) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Plot Accuracy (if not already included)
ggplot(metrics_data, aes(x = Model, y = Accuracy, fill = Model)) +
  geom_bar(stat = "identity") +
  labs(title = "Accuracy of Different Models",
       x = "Model", y = "Accuracy") +
  scale_fill_manual(values = palette_colors) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Load necessary libraries

# Set seed for reproducibility
set.seed(123)

# Define the number of folds
k <- 5 
train_control <- trainControl(method = "cv", number = k)

# Function to perform k-fold cross-validation for different models
evaluate_model <- function(model_formula, method, train_data, train_control) {
  train(model_formula, data = train_data, method = method, trControl = train_control)
}

# Function to calculate metrics: accuracy, sensitivity, specificity
calculate_metrics <- function(predictions, actual) {
  conf_matrix <- confusionMatrix(predictions, actual)
  accuracy <- conf_matrix$overall["Accuracy"]
  return(c(accuracy))
}

# Initialize lists to store metrics for each model
model_names <- c("Multinomial Logistic Regression", "Naive Bayes", "LDA", "Random Forest", "SVM", "Decision Tree", "Neural Network", "kNN")
accuracies <- c()


# Multinomial Logistic Regression
multinom_model <- evaluate_model(NObeyesdad ~ ., "multinom", train_data, train_control)
## # weights:  182 (150 variable)
## initial  value 2628.924611 
## iter  10 value 2022.032314
## iter  20 value 1476.583416
## iter  30 value 1130.422179
## iter  40 value 873.014855
## iter  50 value 598.325272
## iter  60 value 287.562506
## iter  70 value 111.021262
## iter  80 value 60.743904
## iter  90 value 43.007848
## iter 100 value 38.196909
## final  value 38.196909 
## stopped after 100 iterations
## # weights:  182 (150 variable)
## initial  value 2628.924611 
## iter  10 value 2022.084832
## iter  20 value 1480.441608
## iter  30 value 1157.901521
## iter  40 value 940.374510
## iter  50 value 768.841570
## iter  60 value 674.456690
## iter  70 value 654.967581
## iter  80 value 641.823023
## iter  90 value 635.637389
## iter 100 value 629.406062
## final  value 629.406062 
## stopped after 100 iterations
## # weights:  182 (150 variable)
## initial  value 2628.924611 
## iter  10 value 2022.032367
## iter  20 value 1476.587298
## iter  30 value 1130.452999
## iter  40 value 873.111503
## iter  50 value 599.158133
## iter  60 value 290.647944
## iter  70 value 134.039395
## iter  80 value 100.967698
## iter  90 value 89.746072
## iter 100 value 87.104075
## final  value 87.104075 
## stopped after 100 iterations
## # weights:  182 (150 variable)
## initial  value 2626.978701 
## iter  10 value 2080.944367
## iter  20 value 1445.668432
## iter  30 value 1082.846426
## iter  40 value 843.809978
## iter  50 value 572.939272
## iter  60 value 283.817472
## iter  70 value 102.233932
## iter  80 value 62.729350
## iter  90 value 53.936195
## iter 100 value 46.971832
## final  value 46.971832 
## stopped after 100 iterations
## # weights:  182 (150 variable)
## initial  value 2626.978701 
## iter  10 value 2081.031363
## iter  20 value 1451.884487
## iter  30 value 1116.563199
## iter  40 value 912.322646
## iter  50 value 759.804146
## iter  60 value 655.330044
## iter  70 value 635.162891
## iter  80 value 621.033980
## iter  90 value 612.253015
## iter 100 value 606.604510
## final  value 606.604510 
## stopped after 100 iterations
## # weights:  182 (150 variable)
## initial  value 2626.978701 
## iter  10 value 2080.944454
## iter  20 value 1445.674700
## iter  30 value 1082.885861
## iter  40 value 843.908006
## iter  50 value 573.986096
## iter  60 value 286.964342
## iter  70 value 129.856370
## iter  80 value 102.594690
## iter  90 value 91.776011
## iter 100 value 87.523063
## final  value 87.523063 
## stopped after 100 iterations
## # weights:  182 (150 variable)
## initial  value 2628.924611 
## iter  10 value 2064.001874
## iter  20 value 1466.859383
## iter  30 value 1112.414341
## iter  40 value 856.578882
## iter  50 value 542.549274
## iter  60 value 295.561693
## iter  70 value 84.283450
## iter  80 value 53.332430
## iter  90 value 30.587717
## iter 100 value 15.081132
## final  value 15.081132 
## stopped after 100 iterations
## # weights:  182 (150 variable)
## initial  value 2628.924611 
## iter  10 value 2064.094789
## iter  20 value 1471.098043
## iter  30 value 1140.958381
## iter  40 value 916.617940
## iter  50 value 765.774797
## iter  60 value 676.937397
## iter  70 value 648.758164
## iter  80 value 632.700629
## iter  90 value 622.703277
## iter 100 value 617.407475
## final  value 617.407475 
## stopped after 100 iterations
## # weights:  182 (150 variable)
## initial  value 2628.924611 
## iter  10 value 2064.001967
## iter  20 value 1466.863652
## iter  30 value 1112.446162
## iter  40 value 856.658245
## iter  50 value 543.163928
## iter  60 value 297.908165
## iter  70 value 116.022877
## iter  80 value 96.494080
## iter  90 value 85.566996
## iter 100 value 82.292844
## final  value 82.292844 
## stopped after 100 iterations
## # weights:  182 (150 variable)
## initial  value 2632.816432 
## iter  10 value 2051.787661
## iter  20 value 1527.858728
## iter  30 value 1212.439635
## iter  40 value 883.763701
## iter  50 value 613.004629
## iter  60 value 280.073439
## iter  70 value 108.124422
## iter  80 value 50.876338
## iter  90 value 37.378869
## iter 100 value 32.414896
## final  value 32.414896 
## stopped after 100 iterations
## # weights:  182 (150 variable)
## initial  value 2632.816432 
## iter  10 value 2051.840921
## iter  20 value 1530.661364
## iter  30 value 1234.112656
## iter  40 value 947.565063
## iter  50 value 785.193342
## iter  60 value 675.867146
## iter  70 value 644.442538
## iter  80 value 628.342206
## iter  90 value 621.867839
## iter 100 value 613.348798
## final  value 613.348798 
## stopped after 100 iterations
## # weights:  182 (150 variable)
## initial  value 2632.816432 
## iter  10 value 2051.787714
## iter  20 value 1527.861549
## iter  30 value 1212.463626
## iter  40 value 883.854251
## iter  50 value 613.499688
## iter  60 value 283.696067
## iter  70 value 132.743227
## iter  80 value 98.283742
## iter  90 value 89.585819
## iter 100 value 87.253416
## final  value 87.253416 
## stopped after 100 iterations
## # weights:  182 (150 variable)
## initial  value 2621.140971 
## iter  10 value 2082.735255
## iter  20 value 1502.977748
## iter  30 value 1107.099412
## iter  40 value 882.944761
## iter  50 value 592.586249
## iter  60 value 324.505584
## iter  70 value 119.986628
## iter  80 value 64.846638
## iter  90 value 47.442752
## iter 100 value 40.335023
## final  value 40.335023 
## stopped after 100 iterations
## # weights:  182 (150 variable)
## initial  value 2621.140971 
## iter  10 value 2082.824393
## iter  20 value 1507.248312
## iter  30 value 1137.810343
## iter  40 value 948.454438
## iter  50 value 799.149281
## iter  60 value 688.321741
## iter  70 value 659.904019
## iter  80 value 642.706226
## iter  90 value 633.865181
## iter 100 value 628.888377
## final  value 628.888377 
## stopped after 100 iterations
## # weights:  182 (150 variable)
## initial  value 2621.140971 
## iter  10 value 2082.735345
## iter  20 value 1502.982046
## iter  30 value 1107.133822
## iter  40 value 883.037432
## iter  50 value 593.060514
## iter  60 value 327.423481
## iter  70 value 143.443113
## iter  80 value 109.867335
## iter  90 value 96.443348
## iter 100 value 92.889726
## final  value 92.889726 
## stopped after 100 iterations
## # weights:  182 (150 variable)
## initial  value 3284.696332 
## iter  10 value 2606.909296
## iter  20 value 1892.277477
## iter  30 value 1533.263093
## iter  40 value 1107.866374
## iter  50 value 871.352857
## iter  60 value 413.905956
## iter  70 value 193.877492
## iter  80 value 149.047500
## iter  90 value 119.994103
## iter 100 value 108.974723
## final  value 108.974723 
## stopped after 100 iterations
multinom_predictions <- predict(multinom_model, newdata = test_data)
metrics <- calculate_metrics(multinom_predictions, test_data$NObeyesdad)
accuracies <- c(accuracies, metrics[1])

# Naive Bayes
nb_model <- evaluate_model(NObeyesdad ~ ., "nb", train_data, train_control)
## Warning: model fit failed for Fold1: usekernel=FALSE, fL=0, adjust=1 Error in NaiveBayes.default(x, y, usekernel = FALSE, fL = param$fL, ...) : 
##   Zero variances for at least one class in variables: CALCSometimes, CALCFrequently, CALCAlways, FAVCyes, FCVC, NCP, SCCyes, family_history_with_overweightno, CAECAlways, CAECno, MTRANSWalking, MTRANSAutomobile, MTRANSMotorbike, MTRANSBike
## Warning: model fit failed for Fold2: usekernel=FALSE, fL=0, adjust=1 Error in NaiveBayes.default(x, y, usekernel = FALSE, fL = param$fL, ...) : 
##   Zero variances for at least one class in variables: CALCSometimes, CALCFrequently, CALCAlways, FAVCyes, FCVC, NCP, SCCyes, family_history_with_overweightno, CAECFrequently, CAECAlways, CAECno, MTRANSWalking, MTRANSAutomobile, MTRANSMotorbike, MTRANSBike
## Warning: model fit failed for Fold3: usekernel=FALSE, fL=0, adjust=1 Error in NaiveBayes.default(x, y, usekernel = FALSE, fL = param$fL, ...) : 
##   Zero variances for at least one class in variables: GenderMale, CALCSometimes, CALCFrequently, CALCAlways, FAVCyes, FCVC, NCP, SCCyes, SMOKEyes, family_history_with_overweightno, CAECFrequently, CAECAlways, CAECno, MTRANSWalking, MTRANSAutomobile, MTRANSMotorbike, MTRANSBike
## Warning: model fit failed for Fold4: usekernel=FALSE, fL=0, adjust=1 Error in NaiveBayes.default(x, y, usekernel = FALSE, fL = param$fL, ...) : 
##   Zero variances for at least one class in variables: CALCSometimes, CALCFrequently, CALCAlways, FAVCyes, FCVC, NCP, SCCyes, family_history_with_overweightno, CAECAlways, CAECno, MTRANSWalking, MTRANSAutomobile, MTRANSMotorbike, MTRANSBike
## Warning: model fit failed for Fold5: usekernel=FALSE, fL=0, adjust=1 Error in NaiveBayes.default(x, y, usekernel = FALSE, fL = param$fL, ...) : 
##   Zero variances for at least one class in variables: CALCSometimes, CALCFrequently, CALCAlways, FAVCyes, FCVC, NCP, SCCyes, family_history_with_overweightno, CAECAlways, CAECno, MTRANSWalking, MTRANSAutomobile, MTRANSMotorbike, MTRANSBike
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
## Warning in train.default(x, y, weights = w, ...): missing values found in
## aggregated results
nb_predictions <- predict(nb_model, newdata = test_data)
metrics <- calculate_metrics(nb_predictions, test_data$NObeyesdad)
accuracies <- c(accuracies, metrics[1])


# Linear Discriminant Analysis (LDA)
lda_model <- evaluate_model(NObeyesdad ~ ., "lda", train_data, train_control)
## Warning: model fit failed for Fold3: parameter=none Error in lda.default(x, grouping, ...) : 
##   variable 7 appears to be constant within groups
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
lda_predictions <- predict(lda_model, newdata = test_data)
metrics <- calculate_metrics(lda_predictions, test_data$NObeyesdad)
accuracies <- c(accuracies, metrics[1])


# Random Forest
rf_model <- evaluate_model(NObeyesdad ~ ., "rf", train_data, train_control)
rf_predictions <- predict(rf_model, newdata = test_data)
metrics <- calculate_metrics(rf_predictions, test_data$NObeyesdad)
accuracies <- c(accuracies, metrics[1])

# Support Vector Machine (SVM)
svm_model <- evaluate_model(NObeyesdad ~ ., "svmRadial", train_data, train_control)
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.

## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
svm_predictions <- predict(svm_model, newdata = test_data)
metrics <- calculate_metrics(svm_predictions, test_data$NObeyesdad)
accuracies <- c(accuracies, metrics[1])


# Decision Tree
tree_model <- evaluate_model(NObeyesdad ~ ., "rpart", train_data, train_control)
tree_predictions <- predict(tree_model, newdata = test_data)
metrics <- calculate_metrics(tree_predictions, test_data$NObeyesdad)
accuracies <- c(accuracies, metrics[1])

# Neural Network
nn_model <- evaluate_model(NObeyesdad ~ ., "nnet", train_data_norm, train_control)
## # weights:  39
## initial  value 2757.052505 
## iter  10 value 2150.572080
## iter  20 value 1738.812687
## iter  30 value 1573.411806
## iter  40 value 1262.280002
## iter  50 value 943.750268
## iter  60 value 540.621389
## iter  70 value 311.800422
## iter  80 value 182.178722
## iter  90 value 149.138122
## iter 100 value 148.211097
## final  value 148.211097 
## stopped after 100 iterations
## # weights:  103
## initial  value 2812.049289 
## iter  10 value 1068.392086
## iter  20 value 715.177290
## iter  30 value 509.472409
## iter  40 value 366.865758
## iter  50 value 276.762521
## iter  60 value 208.662877
## iter  70 value 169.885574
## iter  80 value 112.340999
## iter  90 value 80.545602
## iter 100 value 63.320302
## final  value 63.320302 
## stopped after 100 iterations
## # weights:  167
## initial  value 2899.181146 
## iter  10 value 1283.911083
## iter  20 value 471.664967
## iter  30 value 341.265617
## iter  40 value 274.723808
## iter  50 value 240.575636
## iter  60 value 193.436357
## iter  70 value 129.370969
## iter  80 value 104.911829
## iter  90 value 82.549131
## iter 100 value 64.127179
## final  value 64.127179 
## stopped after 100 iterations
## # weights:  39
## initial  value 2871.408575 
## iter  10 value 2108.527502
## iter  20 value 2066.421141
## iter  30 value 1981.766469
## iter  40 value 1966.527595
## iter  50 value 1961.502114
## iter  60 value 1921.954233
## iter  70 value 1776.339246
## iter  80 value 1470.525051
## iter  90 value 1321.476920
## iter 100 value 1232.423641
## final  value 1232.423641 
## stopped after 100 iterations
## # weights:  103
## initial  value 2739.804436 
## iter  10 value 1604.616995
## iter  20 value 1163.885922
## iter  30 value 923.539212
## iter  40 value 818.855410
## iter  50 value 733.430644
## iter  60 value 657.426918
## iter  70 value 571.240473
## iter  80 value 537.379120
## iter  90 value 528.417072
## iter 100 value 522.578559
## final  value 522.578559 
## stopped after 100 iterations
## # weights:  167
## initial  value 2713.524814 
## iter  10 value 1067.198953
## iter  20 value 705.482302
## iter  30 value 577.084423
## iter  40 value 483.706011
## iter  50 value 440.465485
## iter  60 value 398.380065
## iter  70 value 364.898957
## iter  80 value 327.739235
## iter  90 value 307.986792
## iter 100 value 300.592349
## final  value 300.592349 
## stopped after 100 iterations
## # weights:  39
## initial  value 2673.060718 
## iter  10 value 1683.486928
## iter  20 value 1503.796444
## iter  30 value 1353.583265
## iter  40 value 1140.341397
## iter  50 value 1005.277751
## iter  60 value 964.898034
## iter  70 value 905.706952
## iter  80 value 784.450074
## iter  90 value 735.389156
## iter 100 value 692.023416
## final  value 692.023416 
## stopped after 100 iterations
## # weights:  103
## initial  value 2874.976897 
## iter  10 value 1028.377478
## iter  20 value 714.571280
## iter  30 value 492.812832
## iter  40 value 356.672487
## iter  50 value 273.887143
## iter  60 value 186.504953
## iter  70 value 131.881460
## iter  80 value 107.648105
## iter  90 value 99.608466
## iter 100 value 91.756917
## final  value 91.756917 
## stopped after 100 iterations
## # weights:  167
## initial  value 2905.094099 
## iter  10 value 1292.929354
## iter  20 value 750.743715
## iter  30 value 469.328958
## iter  40 value 354.793480
## iter  50 value 285.550544
## iter  60 value 235.723471
## iter  70 value 201.220640
## iter  80 value 185.255934
## iter  90 value 175.031737
## iter 100 value 148.429877
## final  value 148.429877 
## stopped after 100 iterations
## # weights:  39
## initial  value 2748.705423 
## iter  10 value 1944.827380
## iter  20 value 1720.510218
## iter  30 value 1662.463335
## iter  40 value 1637.943100
## iter  50 value 1619.487103
## iter  60 value 1597.760294
## iter  70 value 1456.264795
## iter  80 value 1208.634245
## iter  90 value 908.677922
## iter 100 value 575.503797
## final  value 575.503797 
## stopped after 100 iterations
## # weights:  103
## initial  value 2871.198574 
## iter  10 value 1274.581335
## iter  20 value 866.216198
## iter  30 value 654.808645
## iter  40 value 461.192525
## iter  50 value 325.701488
## iter  60 value 183.181852
## iter  70 value 113.549794
## iter  80 value 102.657361
## iter  90 value 101.245801
## iter 100 value 98.202964
## final  value 98.202964 
## stopped after 100 iterations
## # weights:  167
## initial  value 2696.206235 
## iter  10 value 1071.220057
## iter  20 value 640.476537
## iter  30 value 353.227603
## iter  40 value 223.640902
## iter  50 value 164.263198
## iter  60 value 109.006257
## iter  70 value 71.817934
## iter  80 value 54.098330
## iter  90 value 42.288665
## iter 100 value 37.120007
## final  value 37.120007 
## stopped after 100 iterations
## # weights:  39
## initial  value 2723.347436 
## iter  10 value 2430.862254
## iter  20 value 2176.361441
## iter  30 value 2017.692920
## iter  40 value 1893.568984
## iter  50 value 1532.022670
## iter  60 value 1386.314009
## iter  70 value 1299.649531
## iter  80 value 1231.529229
## iter  90 value 1229.083280
## iter 100 value 1227.208123
## final  value 1227.208123 
## stopped after 100 iterations
## # weights:  103
## initial  value 2737.683875 
## iter  10 value 1522.685271
## iter  20 value 1258.101035
## iter  30 value 908.945481
## iter  40 value 776.987774
## iter  50 value 653.338881
## iter  60 value 596.527797
## iter  70 value 555.425433
## iter  80 value 546.058938
## iter  90 value 541.109106
## iter 100 value 530.419401
## final  value 530.419401 
## stopped after 100 iterations
## # weights:  167
## initial  value 2963.507050 
## iter  10 value 1076.173598
## iter  20 value 730.381698
## iter  30 value 613.489591
## iter  40 value 505.152481
## iter  50 value 449.603823
## iter  60 value 421.466071
## iter  70 value 394.800503
## iter  80 value 377.292908
## iter  90 value 358.930148
## iter 100 value 323.206335
## final  value 323.206335 
## stopped after 100 iterations
## # weights:  39
## initial  value 2841.000445 
## iter  10 value 1908.034664
## iter  20 value 1768.864552
## iter  30 value 1672.265744
## iter  40 value 1458.990458
## iter  50 value 1088.849708
## iter  60 value 852.627679
## iter  70 value 603.280297
## iter  80 value 464.086668
## iter  90 value 461.989253
## iter 100 value 453.246295
## final  value 453.246295 
## stopped after 100 iterations
## # weights:  103
## initial  value 2664.912689 
## iter  10 value 1551.506638
## iter  20 value 962.782487
## iter  30 value 763.270148
## iter  40 value 562.277169
## iter  50 value 472.327950
## iter  60 value 414.696474
## iter  70 value 383.160165
## iter  80 value 327.458469
## iter  90 value 243.050555
## iter 100 value 217.961731
## final  value 217.961731 
## stopped after 100 iterations
## # weights:  167
## initial  value 2780.815996 
## iter  10 value 1010.293085
## iter  20 value 670.078253
## iter  30 value 448.713707
## iter  40 value 351.929993
## iter  50 value 259.888352
## iter  60 value 187.271020
## iter  70 value 133.209864
## iter  80 value 102.143769
## iter  90 value 90.037575
## iter 100 value 80.783701
## final  value 80.783701 
## stopped after 100 iterations
## # weights:  39
## initial  value 2852.520803 
## iter  10 value 2221.282414
## iter  20 value 1769.773123
## iter  30 value 1532.943874
## iter  40 value 1259.111054
## iter  50 value 1135.557569
## iter  60 value 1086.620492
## iter  70 value 1073.385542
## iter  80 value 1041.007305
## iter  90 value 1035.204517
## iter 100 value 1028.294454
## final  value 1028.294454 
## stopped after 100 iterations
## # weights:  103
## initial  value 2857.596157 
## iter  10 value 1377.506551
## iter  20 value 888.123076
## iter  30 value 675.626814
## iter  40 value 563.182397
## iter  50 value 444.304347
## iter  60 value 339.784813
## iter  70 value 269.971720
## iter  80 value 214.511717
## iter  90 value 168.270644
## iter 100 value 132.484321
## final  value 132.484321 
## stopped after 100 iterations
## # weights:  167
## initial  value 2940.093879 
## iter  10 value 847.627737
## iter  20 value 384.706461
## iter  30 value 207.033658
## iter  40 value 133.152555
## iter  50 value 88.004151
## iter  60 value 65.437889
## iter  70 value 52.251595
## iter  80 value 45.781035
## iter  90 value 41.901719
## iter 100 value 39.699935
## final  value 39.699935 
## stopped after 100 iterations
## # weights:  39
## initial  value 2763.269372 
## iter  10 value 2103.053080
## iter  20 value 1740.303652
## iter  30 value 1537.419219
## iter  40 value 1387.209150
## iter  50 value 1330.104410
## iter  60 value 1239.876163
## iter  70 value 1225.702776
## iter  80 value 1225.196102
## iter  90 value 1225.150793
## iter 100 value 1225.108040
## final  value 1225.108040 
## stopped after 100 iterations
## # weights:  103
## initial  value 2851.731745 
## iter  10 value 2250.353877
## iter  20 value 1561.479348
## iter  30 value 1328.112294
## iter  40 value 1176.020881
## iter  50 value 957.608664
## iter  60 value 834.973881
## iter  70 value 729.027137
## iter  80 value 657.980954
## iter  90 value 628.036760
## iter 100 value 607.700042
## final  value 607.700042 
## stopped after 100 iterations
## # weights:  167
## initial  value 2871.806202 
## iter  10 value 1151.192354
## iter  20 value 623.563067
## iter  30 value 487.631306
## iter  40 value 420.717367
## iter  50 value 383.176857
## iter  60 value 352.115642
## iter  70 value 328.511207
## iter  80 value 309.469306
## iter  90 value 294.801778
## iter 100 value 286.851755
## final  value 286.851755 
## stopped after 100 iterations
## # weights:  39
## initial  value 2688.079010 
## iter  10 value 2065.121972
## iter  20 value 1846.800317
## iter  30 value 1737.153769
## iter  40 value 1555.730385
## iter  50 value 1267.025265
## iter  60 value 1028.589046
## iter  70 value 811.976446
## iter  80 value 746.832450
## iter  90 value 739.130230
## iter 100 value 715.321695
## final  value 715.321695 
## stopped after 100 iterations
## # weights:  103
## initial  value 2706.163709 
## iter  10 value 1538.836257
## iter  20 value 1029.796927
## iter  30 value 830.898753
## iter  40 value 648.817329
## iter  50 value 501.356096
## iter  60 value 389.105638
## iter  70 value 318.450691
## iter  80 value 284.681443
## iter  90 value 260.098509
## iter 100 value 181.102637
## final  value 181.102637 
## stopped after 100 iterations
## # weights:  167
## initial  value 2795.360875 
## iter  10 value 1404.120376
## iter  20 value 908.803423
## iter  30 value 714.063021
## iter  40 value 565.808340
## iter  50 value 456.982911
## iter  60 value 366.381865
## iter  70 value 316.089929
## iter  80 value 274.795289
## iter  90 value 250.171126
## iter 100 value 235.917155
## final  value 235.917155 
## stopped after 100 iterations
## # weights:  39
## initial  value 2733.169920 
## iter  10 value 2430.354371
## iter  20 value 2254.682694
## iter  30 value 2150.114628
## iter  40 value 2111.424317
## iter  50 value 2019.843786
## iter  60 value 1948.007851
## iter  70 value 1911.358129
## iter  80 value 1895.482979
## iter  90 value 1887.829742
## iter 100 value 1875.235772
## final  value 1875.235772 
## stopped after 100 iterations
## # weights:  103
## initial  value 2871.766380 
## iter  10 value 1274.854730
## iter  20 value 830.040996
## iter  30 value 611.562586
## iter  40 value 477.116741
## iter  50 value 370.461419
## iter  60 value 319.854184
## iter  70 value 301.766885
## iter  80 value 290.542627
## iter  90 value 281.357433
## iter 100 value 273.047597
## final  value 273.047597 
## stopped after 100 iterations
## # weights:  167
## initial  value 2774.518860 
## iter  10 value 1202.079898
## iter  20 value 612.305458
## iter  30 value 422.103274
## iter  40 value 292.452799
## iter  50 value 188.723707
## iter  60 value 120.191487
## iter  70 value 90.430640
## iter  80 value 63.889884
## iter  90 value 51.972584
## iter 100 value 43.975424
## final  value 43.975424 
## stopped after 100 iterations
## # weights:  39
## initial  value 2737.814998 
## iter  10 value 2463.573895
## iter  20 value 2257.171306
## iter  30 value 1809.239064
## iter  40 value 1502.988420
## iter  50 value 1349.677804
## iter  60 value 1275.252891
## iter  70 value 1256.465634
## iter  80 value 1250.020619
## iter  90 value 1244.107433
## iter 100 value 1233.486017
## final  value 1233.486017 
## stopped after 100 iterations
## # weights:  103
## initial  value 2732.667339 
## iter  10 value 1344.312053
## iter  20 value 1070.522165
## iter  30 value 940.755339
## iter  40 value 847.902191
## iter  50 value 723.871097
## iter  60 value 667.542108
## iter  70 value 635.589121
## iter  80 value 609.948981
## iter  90 value 589.383439
## iter 100 value 554.231545
## final  value 554.231545 
## stopped after 100 iterations
## # weights:  167
## initial  value 2935.615373 
## iter  10 value 1096.106574
## iter  20 value 780.886425
## iter  30 value 599.424205
## iter  40 value 487.925572
## iter  50 value 451.477339
## iter  60 value 423.986072
## iter  70 value 394.669452
## iter  80 value 367.059912
## iter  90 value 319.483322
## iter 100 value 294.078953
## final  value 294.078953 
## stopped after 100 iterations
## # weights:  39
## initial  value 2753.766514 
## iter  10 value 2399.389917
## iter  20 value 1852.713748
## iter  30 value 1456.352549
## iter  40 value 1288.198863
## iter  50 value 1091.413813
## iter  60 value 769.135038
## iter  70 value 632.991918
## iter  80 value 568.093165
## iter  90 value 561.215988
## iter 100 value 532.383445
## final  value 532.383445 
## stopped after 100 iterations
## # weights:  103
## initial  value 2724.248440 
## iter  10 value 980.694957
## iter  20 value 649.057481
## iter  30 value 515.000328
## iter  40 value 376.805712
## iter  50 value 273.665936
## iter  60 value 189.291631
## iter  70 value 129.803307
## iter  80 value 110.028231
## iter  90 value 94.918235
## iter 100 value 86.799179
## final  value 86.799179 
## stopped after 100 iterations
## # weights:  167
## initial  value 2956.321489 
## iter  10 value 1022.480651
## iter  20 value 440.081374
## iter  30 value 301.519024
## iter  40 value 208.508049
## iter  50 value 156.980922
## iter  60 value 121.836317
## iter  70 value 90.136512
## iter  80 value 75.087575
## iter  90 value 57.145056
## iter 100 value 49.501246
## final  value 49.501246 
## stopped after 100 iterations
## # weights:  39
## initial  value 2750.045778 
## iter  10 value 2576.108313
## iter  20 value 2134.538394
## iter  30 value 1733.210237
## iter  40 value 1613.057489
## iter  50 value 1567.196421
## iter  60 value 1378.326177
## iter  70 value 1226.160346
## iter  80 value 1015.601750
## iter  90 value 1004.331873
## iter 100 value 954.712703
## final  value 954.712703 
## stopped after 100 iterations
## # weights:  103
## initial  value 2746.069054 
## iter  10 value 1372.240919
## iter  20 value 849.981844
## iter  30 value 660.421859
## iter  40 value 500.304491
## iter  50 value 346.112209
## iter  60 value 247.302449
## iter  70 value 180.235376
## iter  80 value 131.538535
## iter  90 value 109.047934
## iter 100 value 103.024935
## final  value 103.024935 
## stopped after 100 iterations
## # weights:  167
## initial  value 2750.810358 
## iter  10 value 882.060029
## iter  20 value 476.765450
## iter  30 value 353.395835
## iter  40 value 243.066177
## iter  50 value 181.575949
## iter  60 value 133.178789
## iter  70 value 94.147916
## iter  80 value 71.058744
## iter  90 value 59.413151
## iter 100 value 52.137165
## final  value 52.137165 
## stopped after 100 iterations
## # weights:  39
## initial  value 2708.153667 
## iter  10 value 2312.770437
## iter  20 value 2161.409688
## iter  30 value 2014.330515
## iter  40 value 1864.888166
## iter  50 value 1737.283096
## iter  60 value 1672.330721
## iter  70 value 1583.566306
## iter  80 value 1471.780011
## iter  90 value 1391.969118
## iter 100 value 1306.601186
## final  value 1306.601186 
## stopped after 100 iterations
## # weights:  103
## initial  value 2746.801478 
## iter  10 value 1209.715425
## iter  20 value 875.801863
## iter  30 value 729.353353
## iter  40 value 635.302362
## iter  50 value 581.971450
## iter  60 value 540.751459
## iter  70 value 524.755005
## iter  80 value 517.609307
## iter  90 value 514.358196
## iter 100 value 510.907397
## final  value 510.907397 
## stopped after 100 iterations
## # weights:  167
## initial  value 2944.416634 
## iter  10 value 965.112681
## iter  20 value 675.734704
## iter  30 value 532.570343
## iter  40 value 402.711856
## iter  50 value 333.380926
## iter  60 value 293.894844
## iter  70 value 270.123118
## iter  80 value 260.981664
## iter  90 value 256.925483
## iter 100 value 253.426019
## final  value 253.426019 
## stopped after 100 iterations
## # weights:  39
## initial  value 2744.661370 
## iter  10 value 2260.914389
## iter  20 value 2055.847344
## iter  30 value 1914.314561
## iter  40 value 1847.482344
## iter  50 value 1744.499684
## iter  60 value 1700.814245
## iter  70 value 1629.808616
## iter  80 value 1616.009616
## iter  90 value 1610.275253
## iter 100 value 1597.714788
## final  value 1597.714788 
## stopped after 100 iterations
## # weights:  103
## initial  value 2692.487997 
## iter  10 value 1454.385590
## iter  20 value 879.943161
## iter  30 value 705.380339
## iter  40 value 570.637094
## iter  50 value 458.831183
## iter  60 value 366.009173
## iter  70 value 283.038278
## iter  80 value 191.668380
## iter  90 value 133.085836
## iter 100 value 114.111456
## final  value 114.111456 
## stopped after 100 iterations
## # weights:  167
## initial  value 2799.174982 
## iter  10 value 983.947216
## iter  20 value 439.332487
## iter  30 value 193.668475
## iter  40 value 105.396817
## iter  50 value 75.092310
## iter  60 value 64.815405
## iter  70 value 59.775512
## iter  80 value 55.786585
## iter  90 value 53.289424
## iter 100 value 52.603977
## final  value 52.603977 
## stopped after 100 iterations
## # weights:  167
## initial  value 3825.463387 
## iter  10 value 1350.520046
## iter  20 value 956.334548
## iter  30 value 768.968219
## iter  40 value 645.754467
## iter  50 value 585.899658
## iter  60 value 543.647593
## iter  70 value 513.283876
## iter  80 value 482.516668
## iter  90 value 437.100200
## iter 100 value 345.671509
## final  value 345.671509 
## stopped after 100 iterations
nn_predictions <- predict(nn_model, newdata = test_data)
metrics <- calculate_metrics(nn_predictions, test_data$NObeyesdad)
accuracies <- c(accuracies, metrics[1])


# k-Nearest Neighbors (kNN)
# Preprocess the data for normalization
preproc <- preProcess(train_data[, sapply(train_data, is.numeric)], method = c("center", "scale"))
train_data_norm <- predict(preproc, train_data)
test_data_norm <- predict(preproc, test_data)

# kNN model (tuneGrid for k values)
knn_model <- train(NObeyesdad ~ ., data = train_data_norm, method = "knn", trControl = train_control, tuneGrid = expand.grid(.k = 5))
knn_predictions <- predict(knn_model, newdata = test_data_norm)
metrics <- calculate_metrics(knn_predictions, test_data$NObeyesdad)
accuracies <- c(accuracies, metrics[1])

# Combine all metrics into a data frame
metrics_data <- data.frame(
  Model = model_names,
  Accuracy = accuracies

)

# Print the metrics data frame
print(metrics_data)
##                             Model  Accuracy
## 1 Multinomial Logistic Regression 0.9669031
## 2                     Naive Bayes 0.7706856
## 3                             LDA 0.8983452
## 4                   Random Forest 0.9881797
## 5                             SVM 0.9007092
## 6                   Decision Tree 0.5839243
## 7                  Neural Network 0.1725768
## 8                             kNN 0.8747045
# Sort the data frame based on accuracies
sorted_metrics_data <- metrics_data %>% arrange(desc(Accuracy))

# Print the sorted metrics data frame
print(sorted_metrics_data)
##                             Model  Accuracy
## 1                   Random Forest 0.9881797
## 2 Multinomial Logistic Regression 0.9669031
## 3                             SVM 0.9007092
## 4                             LDA 0.8983452
## 5                             kNN 0.8747045
## 6                     Naive Bayes 0.7706856
## 7                   Decision Tree 0.5839243
## 8                  Neural Network 0.1725768
# Plot the accuracies
ggplot(sorted_metrics_data, aes(x =Accuracy, y = Model, fill = Model)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Accuracy of Different Models",
       x = "Model",
       y = "Accuracy") +
  theme_minimal() +
  scale_fill_brewer(palette = "Set3")+theme(axis.text.x = element_text(angle = 45, hjust = 1))

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.